{
 "metadata": {
  "name": ""
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "#encoding=utf8\n",
      "# \u627e\u51fa \u6700\u76f8\u8fd1\u7684\u6587\u7ae0\n",
      "\n",
      "import pymongo\n",
      "import sys\n",
      "import bson\n",
      "sys.path.append('..')\n",
      "import mysimhash \n",
      "Article = pymongo.Connection().stock.Article\n",
      "\n",
      "\n",
      "def pr(t):\n",
      "    print json.dumps(t, ensure_ascii=False,indent=2)\n",
      "id=0\n",
      "bu = mysimhash.SimhashIndex({},k=50)\n",
      "for i in Article.find({},{\"_id\":1,\"sim\":1}).limit(10000):\n",
      "    obj_id = i['_id']\n",
      "    hash_int = mysimhash.Simhash(int(i['sim']))\n",
      "    bu.add( obj_id,hash_int)\n"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 6
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "\n",
      "bu.k=13\n",
      "for item in Article.find().limit(10):\n",
      "    obj_id = str(item['_id'])\n",
      "    sim = mysimhash.Simhash(int(item['sim']))\n",
      "    articles = bu.get_near_dups(sim)\n",
      "    print item['title'],item['_id'],item['url']\n",
      "    for i in articles:\n",
      "        if i[1]==obj_id:continue\n",
      "        art = Article.find_one({\"_id\":bson.objectid.ObjectId(i[1])})\n",
      "        print '\\t',art['title'],art['_id'],art['url']\n"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\u201c\u6b27\u6d32\u94f6\u76d1\u4f1a\u201d\u83b7\u6279 \u96b6\u5c5e\u4e8e\u6b27\u592e\u884c 537613a67f949f18042cb731 http://stock.eastmoney.com/news/1438,20140417377187644.html\n",
        "\t\u591a\u7a7a\u5c55\u5f00\u62c9\u952f\u6218 \u5e02\u573a\u5e73\u6de1\u98d8\u8fc7 537615117f949f18042cbf77 http://stock.eastmoney.com/news/1412,20140514384321731.html\n",
        "\t\u8bc1\u5238\u4ea4\u6613\u516c\u5e73\u6027\u4e0e\u5b9a\u4ef7\u900f\u660e\u5316\u7684\u535a\u5f08\u89e3\u5bc6 5376136c7f949f18042cb609 http://stock.eastmoney.com/news/1436,20140421378049195.html\n",
        "\t3\u5927\u4fe1\u53f7\u5b9a\u8c03\u5f3a\u52bf \u70ed\u70b9\u518d\u6b21\u71c3\u70e7 537613817f949f18042cb682 http://stock.eastmoney.com/news/1412,20140410375553532.html\n",
        "\t\u5fb7\u56fd\u8d22\u957f\u6714\u4f0a\u5e03\u52d2\uff1a\u5e0c\u814a\u7ecf\u6d4e\u589e\u957f\u524d\u666f\u5df2\u7ecf\u5f97\u5230\u5de9\u56fa 5376135b7f949f18042cb593 http://stock.eastmoney.com/news/1438,20140402373614414.html\n",
        "\t\u65b0\u4e09\u677f\u6302\u724c\u9996\u65e5\u906d\u7206\u7092 \u96c5\u5a01\u7279\u6da8\u903e70\u500d 537613fb7f949f18042cb944 http://stock.eastmoney.com/news/1505,20140125356631634.html\n",
        "\u5fb7\u62c9\u5409\u5439\u5bbd\u677e\u98ce \u8d44\u91d1\u6d8c\u5165\u6b27\u6d32\u5e02\u573a\u6284\u5e95"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        " 537613a77f949f18042cb733 http://stock.eastmoney.com/news/1438,20140418377595821.html\n",
        "\t\u4e4c\u514b\u5170\u5c40\u52bf\u6076\u5316\u5f15\u53d1\u62c5\u5fe7 \u7f8e\u80a1\u5c0f\u5e45\u6536\u4f4e 537613d27f949f18042cb843 http://stock.eastmoney.com/news/1436,20140503381418384.html\n",
        "\t\u521b\u4e1a\u677f\u518d\u878d\u8d44 \u522b\u8d70\u4e3b\u677f\u5708\u94b1\u8001\u8def 537614ed7f949f18042cbea3 http://stock.eastmoney.com/news/1413,20140507382345435.html\n",
        "\t\u7269\u8054\u7f51\u677f\u575710\u5927\u6982\u5ff5\u80a1\u4ef7\u503c\u89e3\u6790 5376160c7f949f18042cc6c9 http://stock.eastmoney.com/news/1414,20140219361360088.html\n",
        "\t\u7f57\u83b1\u5bb6\u7eba\u8c03\u7814\u7b80\u62a5:\u4e1a\u7ee9\u6709\u5411\u597d\u8d8b\u52bf \u7535\u5546\u4ecd\u5927\u6709\u53ef\u4e3a 5376177a7f949f18042cd11c http://stock.eastmoney.com/news/1417,20140313367932890.html\n",
        "\t\u8bc1\u5238\u4ea4\u6613\u516c\u5e73\u6027\u4e0e\u5b9a\u4ef7\u900f\u660e\u5316\u7684\u535a\u5f08\u89e3\u5bc6 5376136c7f949f18042cb609 http://stock.eastmoney.com/news/1436,20140421378049195.html\n",
        "\u4e13\u5bb6\uff1a\u4f1a\u8c08\u53ea\u662f\u666e\u4eac\u7f13\u5175\u4e4b\u8ba1 \u4fc4\u7f57\u65af\u91ce\u5fc3\u672a\u9000 537613ad7f949f18042cb76c http://stock.eastmoney.com/news/1438,20140422378358290.html\n",
        "\u5fb7\u56fd\u6cd5\u5170\u514b\u798f\u80a1\u5e02DAX\u6307\u657022\u65e5\u4e0a\u6da8190.38\u70b9 537613b27f949f18042cb77d http://stock.eastmoney.com/news/1438,20140423378897218.html\n",
        "\u673a\u4f1a\u6765\u4e86\uff01\u6b27\u6d32\u590d\u82cf\u4e2d\u7684\u4e24\u4e2a\u6295\u8d44\u673a\u4f1a"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        " 537613b47f949f18042cb785 http://stock.eastmoney.com/news/1438,20140423379296724.html\n",
        "\t3\u5927\u4fe1\u53f7\u5b9a\u8c03\u5f3a\u52bf \u70ed\u70b9\u518d\u6b21\u71c3\u70e7 537613817f949f18042cb682 http://stock.eastmoney.com/news/1412,20140410375553532.html\n",
        "\t(12/15)\u665a\u95f4\u6caa\u6df1\u4e0a\u5e02\u516c\u53f8\u91cd\u5927\u4e8b\u9879\u516c\u544a\u6700\u65b0\u5feb\u9012 537614b37f949f18042cbda5 http://stock.eastmoney.com/news/1398,20131215345943321.html\n",
        "\t\u533b\u836f\u76c8\u5229\u589e\u957f\u4f30\u503c\u8c03\u6574\u5230\u4f4d 5376175a7f949f18042cd03e http://stock.eastmoney.com/news/1421,20140510383332156.html\n",
        "\t\u6280\u672f\u9762\u652f\u6301\u7ee7\u7eed\u53cd\u5f39 5376166f7f949f18042cc971 http://stock.eastmoney.com/news/1407,20140513383748014.html\n",
        "\t\u57fa\u672c\u9762\u56e0\u7d20\u5fae\u5999\u53d8\u5316 \u53cd\u5f39\u62d0\u70b9\u663e\u73b0 537613887f949f18042cb6b0 http://stock.eastmoney.com/news/1412,20140408374654561.html\n",
        "\u7f8e\u56fd2\u6708\u670d\u52a1\u4e1aPMI\u7ec8\u503c\u4e0b\u964d\u81f353.3"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        " 537613e77f949f18042cb8be http://stock.eastmoney.com/news/1436,20140505381826187.html\n",
        "\t3\u5927\u4fe1\u53f7\u5b9a\u8c03\u5f3a\u52bf \u70ed\u70b9\u518d\u6b21\u71c3\u70e7 537613817f949f18042cb682 http://stock.eastmoney.com/news/1412,20140410375553532.html\n",
        "\t(12/15)\u665a\u95f4\u6caa\u6df1\u4e0a\u5e02\u516c\u53f8\u91cd\u5927\u4e8b\u9879\u516c\u544a\u6700\u65b0\u5feb\u9012 537614b37f949f18042cbda5 http://stock.eastmoney.com/news/1398,20131215345943321.html\n",
        "\t\u533b\u836f\u76c8\u5229\u589e\u957f\u4f30\u503c\u8c03\u6574\u5230\u4f4d 5376175a7f949f18042cd03e http://stock.eastmoney.com/news/1421,20140510383332156.html\n",
        "\t\u6280\u672f\u9762\u652f\u6301\u7ee7\u7eed\u53cd\u5f39 5376166f7f949f18042cc971 http://stock.eastmoney.com/news/1407,20140513383748014.html\n",
        "\t\u57fa\u672c\u9762\u56e0\u7d20\u5fae\u5999\u53d8\u5316 \u53cd\u5f39\u62d0\u70b9\u663e\u73b0 537613887f949f18042cb6b0 http://stock.eastmoney.com/news/1412,20140408374654561.html\n",
        "\u670d\u52a1\u4e1a\u6307\u6570\u6500\u5347 \u7f8e\u80a1\u6536\u9ad8"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        " 537613ea7f949f18042cb8d0 http://stock.eastmoney.com/news/1436,20140506382122252.html\n",
        "\t3\u5927\u4fe1\u53f7\u5b9a\u8c03\u5f3a\u52bf \u70ed\u70b9\u518d\u6b21\u71c3\u70e7 537613817f949f18042cb682 http://stock.eastmoney.com/news/1412,20140410375553532.html\n",
        "\t(12/15)\u665a\u95f4\u6caa\u6df1\u4e0a\u5e02\u516c\u53f8\u91cd\u5927\u4e8b\u9879\u516c\u544a\u6700\u65b0\u5feb\u9012 537614b37f949f18042cbda5 http://stock.eastmoney.com/news/1398,20131215345943321.html\n",
        "\t\u533b\u836f\u76c8\u5229\u589e\u957f\u4f30\u503c\u8c03\u6574\u5230\u4f4d 5376175a7f949f18042cd03e http://stock.eastmoney.com/news/1421,20140510383332156.html\n",
        "\t\u6280\u672f\u9762\u652f\u6301\u7ee7\u7eed\u53cd\u5f39 5376166f7f949f18042cc971 http://stock.eastmoney.com/news/1407,20140513383748014.html\n",
        "\t\u57fa\u672c\u9762\u56e0\u7d20\u5fae\u5999\u53d8\u5316 \u53cd\u5f39\u62d0\u70b9\u663e\u73b0 537613887f949f18042cb6b0 http://stock.eastmoney.com/news/1412,20140408374654561.html\n",
        "\u7ebd\u7ea6\u63a8\u51fa\u5168\u7f8e\u6700\u5927\u89c4\u6a21\u7ecf\u6d4e\u9002\u7528\u623f\u8ba1\u5212 537613ea7f949f18042cb8d3 http://stock.eastmoney.com/news/1436,20140506382028217.html\n",
        "\t\u4e4c\u514b\u5170\u5c40\u52bf\u6076\u5316\u5f15\u53d1\u62c5\u5fe7 \u7f8e\u80a1\u5c0f\u5e45\u6536\u4f4e 537613d27f949f18042cb843 http://stock.eastmoney.com/news/1436,20140503381418384.html\n",
        "\t\u521b\u4e1a\u677f\u518d\u878d\u8d44 \u522b\u8d70\u4e3b\u677f\u5708\u94b1\u8001\u8def 537614ed7f949f18042cbea3 http://stock.eastmoney.com/news/1413,20140507382345435.html\n",
        "\t\u7269\u8054\u7f51\u677f\u575710\u5927\u6982\u5ff5\u80a1\u4ef7\u503c\u89e3\u6790 5376160c7f949f18042cc6c9 http://stock.eastmoney.com/news/1414,20140219361360088.html\n",
        "\t\u7f57\u83b1\u5bb6\u7eba\u8c03\u7814\u7b80\u62a5:\u4e1a\u7ee9\u6709\u5411\u597d\u8d8b\u52bf \u7535\u5546\u4ecd\u5927\u6709\u53ef\u4e3a 5376177a7f949f18042cd11c http://stock.eastmoney.com/news/1417,20140313367932890.html\n",
        "\t\u8bc1\u5238\u4ea4\u6613\u516c\u5e73\u6027\u4e0e\u5b9a\u4ef7\u900f\u660e\u5316\u7684\u535a\u5f08\u89e3\u5bc6 5376136c7f949f18042cb609 http://stock.eastmoney.com/news/1436,20140421378049195.html\n",
        "\u878d\u8d44\u8d44\u91d1\u4e70\u5165\u4e2d\u4fe1\u94f6\u884c"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        " 5376141b7f949f18042cba0d http://stock.eastmoney.com/news/1423,20140313367866083.html\n",
        "\t\u5b9d\u65b0\u80fd\u6e90\uff1a\u52a0\u5feb\u6218\u7565\u8f6c\u578b \u672a\u6765\u53d1\u5c55\u53ef\u671f 537616307f949f18042cc7d1 http://stock.eastmoney.com/news/1417,20140124356499429.html\n",
        "\t\u56fd\u8054\u8bc1\u5238\uff1a\u7ed9\u4e88\u5929\u58d5\u8282\u80fd\u201c\u63a8\u8350\u201d\u8bc4\u7ea7 537617437f949f18042ccf9c http://stock.eastmoney.com/news/1478,20140228364225469.html\n",
        "\t3\u5927\u4fe1\u53f7\u5b9a\u8c03\u5f3a\u52bf \u70ed\u70b9\u518d\u6b21\u71c3\u70e7 537613817f949f18042cb682 http://stock.eastmoney.com/news/1412,20140410375553532.html\n",
        "\t\u7f8e\u6307\u8d23\u4fc4\u7f57\u65af\u662f\u4e4c\u514b\u5170\u4e1c\u90e8\u6df7\u4e71\u201c\u50ac\u5316\u5242\u201d 537613477f949f18042cb508 http://stock.eastmoney.com/news/1438,20140409374952688.html\n",
        "\t(12/15)\u665a\u95f4\u6caa\u6df1\u4e0a\u5e02\u516c\u53f8\u91cd\u5927\u4e8b\u9879\u516c\u544a\u6700\u65b0\u5feb\u9012 537614b37f949f18042cbda5 http://stock.eastmoney.com/news/1398,20131215345943321.html\n",
        "\u6613\u4e16\u8fbe2013\u5e74\u4e1a\u7ee9\u9884\u589e \u76c8\u5229\u80fd\u529b\u4fdd\u6301\u7a33\u5b9a"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        " 537614667f949f18042cbbe3 http://stock.eastmoney.com/news/1399,20140127356989684.html\n",
        "\t3\u5927\u4fe1\u53f7\u5b9a\u8c03\u5f3a\u52bf \u70ed\u70b9\u518d\u6b21\u71c3\u70e7 537613817f949f18042cb682 http://stock.eastmoney.com/news/1412,20140410375553532.html\n",
        "\t(12/15)\u665a\u95f4\u6caa\u6df1\u4e0a\u5e02\u516c\u53f8\u91cd\u5927\u4e8b\u9879\u516c\u544a\u6700\u65b0\u5feb\u9012 537614b37f949f18042cbda5 http://stock.eastmoney.com/news/1398,20131215345943321.html\n",
        "\t\u533b\u836f\u76c8\u5229\u589e\u957f\u4f30\u503c\u8c03\u6574\u5230\u4f4d 5376175a7f949f18042cd03e http://stock.eastmoney.com/news/1421,20140510383332156.html\n",
        "\t\u6280\u672f\u9762\u652f\u6301\u7ee7\u7eed\u53cd\u5f39 5376166f7f949f18042cc971 http://stock.eastmoney.com/news/1407,20140513383748014.html\n",
        "\t\u57fa\u672c\u9762\u56e0\u7d20\u5fae\u5999\u53d8\u5316 \u53cd\u5f39\u62d0\u70b9\u663e\u73b0 537613887f949f18042cb6b0 http://stock.eastmoney.com/news/1412,20140408374654561.html\n"
       ]
      }
     ],
     "prompt_number": 13
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import jieba"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 14
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import jieba.analyse\n"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 15
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%time\n",
      "for i in Article.find():\n",
      "    keyword = jieba.analyse.extract_tags(html2text(i['body'])) \n",
      "    i['keyword']=keyword\n",
      "    Article.save(i)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "Wall time: 0 ns\n"
       ]
      }
     ],
     "prompt_number": 39
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}